# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
# for filename in filenames:
# print(os.path.join(dirname, filename))
# Any results you write to the current directory are saved as output.
As we have two datasets, we need to do a little work to select data from each of them and merge it into our own dataset.
# --- Load the raw inputs ---------------------------------------------------
# Global-forecasting (week 3) training data.
fpGlobal = "../input/covid19-global-forecasting-week-3/train.csv"
gl = pd.read_csv(fpGlobal)

# Enriched (week 2) dataset.
fpEnr = "/kaggle/input/covid19-enriched-dataset-week-2/enriched_covid_19_week_2.csv"
enr = pd.read_csv(fpEnr)

# Directory of the per-country statistical CSV files; it is a folder, so it
# is listed later on rather than read with a single read_csv call.
fpPop = "/kaggle/input/covcsd-covid19-countries-statistical-dataset/"

# --- First look at the data (notebook-style display expressions) -----------
gl.tail()
enr.head()
enr.columns
enr["Country_Region"].unique()  # original author's note: 293
Let's see which columns we have in our dataset:
import os
# Length of the statistical-dataset directory path.
s = len(fpPop)

# Load one country file (Afghanistan) as a sample of the statistical dataset.
afghanistan_csv = "../input/covcsd-covid19-countries-statistical-dataset/Afghanistan_COVID19.csv"
af = pd.read_csv(afghanistan_csv)
af.head()

# Compare the column sets of the sample file and the enriched dataset.
for frame in (af, enr):
    print(frame.columns)
af.head()
Checking the temperature variation for Afghanistan:
# Remove exact duplicate rows before inspecting the weather columns.
af = af.drop_duplicates()

# Temperature range found in the Afghanistan file.
temperature = af.Temperature
maxt, mint = temperature.max(), temperature.min()
print(f"max = {maxt}, min = {mint}")
af.Temperature.unique()
af["Wind_speed"].unique()

# Restriction / quarantine columns of the enriched dataset, with the country
# column renamed to "Country" so it can be joined with the per-country files.
qu = enr.loc[:, ["Country_Region", "restrictions", "quarentine"]].rename(
    columns={"Country_Region": "Country"}
)
print(qu.head())
print("length qu : ", len(qu))
Let's first select the data for just one country.
# Restriction / quarantine rows of the enriched dataset for one country.
# `qu_Af` was used below without ever being defined, which raised NameError.
# NOTE(review): assumes the "Country" value in the Afghanistan file is
# exactly "Afghanistan" -- confirm against the CSV.
# Duplicate rows are dropped first: the join key is "Country" only, so every
# repeated row in `qu_Af` would otherwise repeat every row of `af`.
qu_Af = qu.loc[qu["Country"] == "Afghanistan"].drop_duplicates()

# Left join keeps every row of the statistical data even if nothing matches.
af = af.merge(qu_Af, on="Country", how="left")
af.tail()
Finally, let's grab all the files from our statistical dataset that are also in the enriched dataset, so we will not have missing values.
import re
# Per-country frames (statistical data joined with restriction info), keyed
# by the country name taken from the file name.
countriesDf = {}
# Files whose country has no rows in the enriched dataset.
not_in = []

# Files in the directory that are deliberately not treated as country files.
skipped_files = {"temperature_data.csv", "Tanzania.csv"}
# The country name is the part of the file name before the first "_" or the
# ".csv" suffix.  The dot is escaped: the previous pattern '_|.csv' treated
# it as "any character".
name_separator = re.compile(r"_|\.csv")

for filename in os.listdir(fpPop):
    if filename in skipped_files:
        continue

    # Read from the same directory that was listed, instead of a second
    # hard-coded spelling of the path.
    curr_df = pd.read_csv(os.path.join(fpPop, filename))
    res = name_separator.split(filename)

    # Restriction rows for this country.  They are deduplicated because the
    # join below is on "Country" only: each repeated row here would repeat
    # every row of the country file.
    new = qu.loc[qu["Country"] == res[0]].drop_duplicates()
    if not new.empty:
        curr_df = curr_df.merge(new, on="Country", how="left")
        countriesDf[res[0]] = curr_df
    else:
        print("Dieser Lände ist nicht dort in der ENR")
        not_in.append(filename)

print("Number of countries considered : ", len(countriesDf))
def heat(curr):
    """Repeat a country's location once per cumulative case.

    The result feeds a heat map: a country with more cases contributes more
    identical points and therefore a denser spot.

    Args:
        curr: DataFrame for one country with the columns "Latitude",
            "Longitude", "Cumulative_cases" and "Country".

    Returns:
        DataFrame with columns ["Latitude", "Longitude", "Cumulative_cases",
        "Country"] holding ``max(Cumulative_cases)`` identical rows built
        from the first row of ``curr``.  It is empty when ``curr`` is empty
        or the maximum is missing or not positive.
    """
    columns = ["Latitude", "Longitude", "Cumulative_cases", "Country"]

    n = curr.Cumulative_cases.max()
    if pd.isna(n):
        # Empty input or an all-missing column: nothing to plot.
        return pd.DataFrame([], columns=columns)
    # The column may be float (e.g. when it contains gaps); a float cannot
    # be used as a repeat count.
    n = int(n)

    # Positional access: the index label 0 is not guaranteed to exist once
    # rows have been filtered or dropped.
    row = [
        curr["Latitude"].iloc[0],
        curr["Longitude"].iloc[0],
        n,
        curr["Country"].iloc[0],
    ]
    # A non-positive count yields an empty list, hence an empty frame.
    return pd.DataFrame([row] * n, columns=columns)
from datetime import datetime
# Build two frames from the per-country data:
#   new     -- one row per country and date with the selected columns plus
#              log-scaled case/death counts (used by the animated plots);
#   heat_df -- the heat-map points produced by heat() for every country.
# Per-country pieces are collected in lists and concatenated once at the
# end; concatenating inside the loop re-copies everything on each pass.
country_frames = []
heat_frames = []

selected_columns = [
    "Country", "Date", "Cumulative_cases", 'Cumulative_death',
    'Population Density/km', 'Latitude', 'Longitude', 'Temperature',
    "Median_Age", "Life Expectancy (M)", "Lung Patients (M)",
]

for name, country_df in countriesDf.items():
    # Work on an independent, de-duplicated copy so the cached frame in
    # countriesDf is never modified.
    curr = country_df.drop_duplicates().copy()

    # Some files lack a proper "Date" header.  The known variant has a
    # leading space; repair that one, otherwise skip the country instead
    # of failing on the date parsing below.
    if "Date" not in curr.columns:
        print(f"The country {name} has no Date column!")
        if " Date" in curr.columns:
            curr = curr.rename(columns={" Date": "Date"})
            print("We fixed!")
        else:
            print("ich weiss nicht was passiert!")
            continue

    # Parse the dates once and derive everything from the parsed values.
    # NOTE(review): the raw date format of the CSV files is not visible
    # here; parsing is left to pandas' inference exactly as before.
    parsed = pd.to_datetime(curr['Date'])
    curr['Date'] = parsed.dt.strftime('%d-%m-%Y')
    curr["Month"] = parsed.dt.month
    curr["Day"] = parsed.dt.day

    # .copy() so the new columns are written to an independent frame rather
    # than to a slice of `curr`.
    curr_is = curr.loc[:, selected_columns].copy()

    # "Date" is already a day-first '%d-%m-%Y' string.  Re-parsing it with
    # pd.to_datetime (month-first by default) swapped day and month for
    # every day <= 12, so the formatted value is reused directly.
    curr_is['NewDate'] = curr_is['Date']
    # +1 keeps the logarithm defined for zero counts.
    curr_is['log_ConfirmedCases'] = np.log(curr_is.Cumulative_cases + 1)
    curr_is['log_deaths'] = np.log(curr_is.Cumulative_death + 1)

    country_frames.append(curr_is)
    # Points for the heat map.
    heat_frames.append(heat(curr))

# pd.concat rejects an empty list, so fall back to an empty frame.
new = pd.concat(country_frames, ignore_index=True) if country_frames else pd.DataFrame()
heat_df = pd.concat(heat_frames, ignore_index=True) if heat_frames else pd.DataFrame()

print("size : ", len(new))
print("number countries : ", new.Country.unique())
heat_df.tail()
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as py
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import glob
# Enable inline rendering of plotly figures in the notebook.
py.init_notebook_mode(connected=True)

# Animated world map: countries are coloured by the log of their cumulative
# confirmed cases, with one animation frame per "NewDate" value.
choropleth_options = dict(
    locations="Country",
    locationmode='country names',
    color="log_ConfirmedCases",
    hover_name="Country",
    projection="natural earth",  # alternative tried by the author: "mercator"
    animation_frame="NewDate",
    width=1000,
    height=800,
    color_continuous_scale=px.colors.sequential.Viridis,
    title='COVID-19 Cases Across World',
)
fig = px.choropleth(new, **choropleth_options)

# Keep the colour scale visible, then render the figure.
fig.update(layout_coloraxis_showscale=True)
py.offline.iplot(fig)
Here is the heatmap for the confirmed cases. A heatmap is just another representation of the geographical density of some property; in this case, the number of confirmed cases for the latest day considered in our dataset.
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
# Base map at a world-level zoom.
m_5 = folium.Map(location=[42.32, -71.0589], tiles='cartodbpositron', zoom_start=2)

# One heat point per row of heat_df; only the coordinates are needed.
heat_points = heat_df[['Latitude', 'Longitude']]
heat_layer = HeatMap(data=heat_points, radius=10)
heat_layer.add_to(m_5)

# Display the map (notebook-style last expression).
m_5
# Work on a copy so the plotting cells cannot alter `new`.
df = new.copy()
import plotly.express as px

# Animated bar chart of log confirmed cases for every country, one frame
# per "NewDate" value.
bar_options = dict(
    x="Country",
    y="log_ConfirmedCases",
    color="Country",
    animation_frame="NewDate",
    animation_group="Country",
    range_y=[0, 12],
)
fig = px.bar(df, **bar_options)
fig.show()
It is quite difficult to get a good visualization with all the countries considered. Let's look at only some of them. (You can change the countries as you like.)
Only for 4 European countries:
# Restrict the animated bar chart to a handful of countries.
eur = ["Italy", "France", "Germany", "Belgium"]
ex = df.loc[df["Country"].isin(eur)]

bar_options = dict(
    x="Country",
    y="log_ConfirmedCases",
    color="Country",
    animation_frame="NewDate",
    animation_group="Country",
    range_y=[0, 12],
)
fig = px.bar(ex, **bar_options)
fig.show()
Life Expectancy:
# Countries shown in the temperature / life-expectancy plot.
eur = [
    "Italy", "Afghanistan", "Ukraine", "Argentina",
    "Jamaica", "Albania", "Thailand", "Togo",
]
ex = df.loc[df["Country"].isin(eur)]
ex.head()
Does the temperature play an important role in the spread of the virus? The size of the bubbles reflects the (log-scaled) number of confirmed cases, which is why they grow over time.
# Animated scatter plot: temperature against life expectancy, one bubble per
# country sized by its log confirmed cases, one frame per "NewDate" value.
scatter_options = dict(
    x="Temperature",
    y="Life Expectancy (M)",
    animation_frame="NewDate",
    animation_group="Country",
    size="log_ConfirmedCases",
    color="Country",
    hover_name="Country",
    log_x=False,
    size_max=55,
    range_x=[-8, 35],
    range_y=[40, 90],
)
px.scatter(ex, **scatter_options)